1 Effect of UPSTM-Based Decorrelation on Feature Discovery

1.0.1 Loading the libraries

library("FRESA.CAD")
library(readxl)
library(igraph)
library(umap)
library(tsne)
library(entropy)

op <- par(no.readonly = TRUE)
pander::panderOptions('digits', 3)
pander::panderOptions('table.split.table', 400)
pander::panderOptions('keep.trailing.zeros',TRUE)

1.1 Material and Methods

1.2 The Data



#data('gravier', package = 'datamicroarray')
#table(gravier$y)

load("~/GitHub/datamicroarray/data/gravier.RData")


gravierset <- as.data.frame(cbind(class=1*(gravier$y=="poor"),gravier$x))

gravier <- NULL

1.2.0.1 Standarize the names for the reporting

studyName <- "GRAVIER"
dataframe <- gravierset
outcome <- "class"

TopVariables <- 10

thro <- 0.80
cexheat = 0.15

1.3 Generaring the report

1.3.1 Libraries

Some libraries

library(psych)
library(whitening)
library("vioplot")
library("rpart")

1.3.2 Data specs

pander::pander(c(rows=nrow(dataframe),col=ncol(dataframe)-1))
rows col
168 2905
pander::pander(table(dataframe[,outcome]))
0 1
111 57

varlist <- colnames(dataframe)
varlist <- varlist[varlist != outcome]

largeSet <- length(varlist) > 1500 

1.3.3 Scaling the data

Scaling and removing near zero variance columns and highly co-linear(r>0.99999) columns


  ### Some global cleaning
  sdiszero <- apply(dataframe,2,sd) > 1.0e-16
  dataframe <- dataframe[,sdiszero]

  varlist <- colnames(dataframe)[colnames(dataframe) != outcome]
  tokeep <- c(as.character(correlated_Remove(dataframe,varlist,thr=0.99999)),outcome)
  dataframe <- dataframe[,tokeep]

  varlist <- colnames(dataframe)
  varlist <- varlist[varlist != outcome]
  
  iscontinous <- sapply(apply(dataframe,2,unique),length) >= 5 ## Only variables with enough samples



dataframeScaled <- FRESAScale(dataframe,method="OrderLogit")$scaledData

1.4 The heatmap of the data

numsub <- nrow(dataframe)
if (numsub > 1000) numsub <- 1000


if (!largeSet)
{

  hm <- heatMaps(data=dataframeScaled[1:numsub,],
                 Outcome=outcome,
                 Scale=TRUE,
                 hCluster = "row",
                 xlab="Feature",
                 ylab="Sample",
                 srtCol=45,
                 srtRow=45,
                 cexCol=cexheat,
                 cexRow=cexheat
                 )
  par(op)
}

1.4.0.1 Correlation Matrix of the Data

The heat map of the data


if (!largeSet)
{

  par(cex=0.6,cex.main=0.85,cex.axis=0.7)
  #cormat <- Rfast::cora(as.matrix(dataframe[,varlist]),large=TRUE)
  cormat <- cor(dataframe[,varlist],method="pearson")
  cormat[is.na(cormat)] <- 0
  gplots::heatmap.2(abs(cormat),
                    trace = "none",
  #                  scale = "row",
                    mar = c(5,5),
                    col=rev(heat.colors(5)),
                    main = "Original Correlation",
                    cexRow = cexheat,
                    cexCol = cexheat,
                     srtCol=45,
                     srtRow=45,
                    key.title=NA,
                    key.xlab="|Pearson Correlation|",
                    xlab="Feature", ylab="Feature")
  diag(cormat) <- 0
  print(max(abs(cormat)))
}

1.5 The decorrelation


DEdataframe <- IDeA(dataframe,verbose=TRUE,thr=thro)
#> 
#>  g1int824 g4D02 g1int718 g10E08 g1int23 g5B03 
#>     g2E09     g7F07     g1A01     g3C09     g3H08     g1A08 
#> 0.5669535 0.6771084 0.8096386 0.2306368 0.2034423 0.5913941 
#> 
#>  Included: 2905 , Uni p: 5.163511e-05 , Base Size: 14 , Rcrit: 0.2950683 
#> 
#> 
 1 <R=0.857,thr=0.950>, Top: 6< 5 >[Fa= 6 ]( 6 , 14 , 0 ),<|><>Tot Used: 20 , Added: 14 , Zero Std: 0 , Max Cor: 0.952
#> 
 2 <R=0.855,thr=0.950>, Top: 1< 1 >[Fa= 7 ]( 1 , 1 , 6 ),<|><>Tot Used: 22 , Added: 1 , Zero Std: 0 , Max Cor: 0.950
#> 
 3 <R=0.855,thr=0.900>, Top: 51< 3 >[Fa= 54 ]( 51 , 82 , 7 ),<|><>Tot Used: 149 , Added: 82 , Zero Std: 0 , Max Cor: 0.914
#> 
 4 <R=0.845,thr=0.900>, Top: 3< 1 >[Fa= 57 ]( 3 , 3 , 54 ),<|><>Tot Used: 155 , Added: 3 , Zero Std: 0 , Max Cor: 0.900
#> 
 5 <R=0.845,thr=0.800>, Top: 204< 9 >..[Fa= 232 ]( 203 , 460 , 57 ),<|><>Tot Used: 777 , Added: 460 , Zero Std: 0 , Max Cor: 0.897
#> 
 6 <R=0.834,thr=0.800>, Top: 42< 13 >[Fa= 271 ]( 42 , 74 , 232 ),<|><>Tot Used: 887 , Added: 74 , Zero Std: 0 , Max Cor: 0.864
#> 
 7 <R=0.840,thr=0.800>, Top: 5< 3 >[Fa= 276 ]( 5 , 9 , 271 ),<|><>Tot Used: 901 , Added: 9 , Zero Std: 0 , Max Cor: 0.800
#> 
 8 <R=0.800,thr=0.800>
#> 
 [ 8 ], 0.7999747 Decor Dimension: 901 Nused: 901 . Cor to Base: 618 , ABase: 2905 , Outcome Base: 0 
#> 
varlistc <- colnames(DEdataframe)[colnames(DEdataframe) != outcome]

pander::pander(sum(apply(dataframe[,varlist],2,var)))

61.1

pander::pander(sum(apply(DEdataframe[,varlistc],2,var)))

46.4

pander::pander(entropy(discretize(unlist(dataframe[,varlist]), 256)))

3

pander::pander(entropy(discretize(unlist(DEdataframe[,varlistc]), 256)))

2.85


varratio <- attr(DEdataframe,"VarRatio")

pander::pander(tail(varratio))
La_g4B02 La_g1int832 La_g1int829 La_g6A10 La_g1int823 La_g1int828
0.0602 0.0568 0.0551 0.0548 0.0401 0.0344

1.5.1 The decorrelation matrix


if (!largeSet)
{

  par(cex=0.6,cex.main=0.85,cex.axis=0.7)
  
  UPLTM <- attr(DEdataframe,"UPLTM")
  
  gplots::heatmap.2(1.0*(abs(UPLTM)>0),
                    trace = "none",
                    mar = c(5,5),
                    col=rev(heat.colors(5)),
                    main = "Decorrelation matrix",
                    cexRow = cexheat,
                    cexCol = cexheat,
                   srtCol=45,
                   srtRow=45,
                    key.title=NA,
                    key.xlab="|Beta|>0",
                    xlab="Output Feature", ylab="Input Feature")
  
  par(op)
  
  
  
}

1.5.2 Formulas Network

Displaying the features associations

par(op)
clustable <- c("To many variables")


  transform <- attr(DEdataframe,"UPLTM") != 0
  tnames <- colnames(transform)
  colnames(transform) <- str_remove_all(colnames(transform),"La_")
  transform <- abs(transform*cor(dataframe[,rownames(transform)])) # The weights are proportional to the observed correlation
  
  
  fscore <- attr(DEdataframe,"fscore")
  VertexSize <- fscore # The size depends on the variable independence relevance (fscore)
  names(VertexSize) <- str_remove_all(names(VertexSize),"La_")
  VertexSize <- 10*(VertexSize-min(VertexSize))/(max(VertexSize)-min(VertexSize)) # Normalization

  VertexSize <- VertexSize[rownames(transform)]
  rsum <- apply(1*(transform !=0),1,sum) + 0.01*VertexSize + 0.001*varratio[tnames]
  csum <- apply(1*(transform !=0),2,sum) + 0.01*VertexSize + 0.001*varratio[tnames]
  
  ntop <- min(10,length(rsum))


  topfeatures <- unique(c(names(rsum[order(-rsum)])[1:ntop],names(csum[order(-csum)])[1:ntop]))
  rtrans <- transform[topfeatures,]
  csum <- (apply(1*(rtrans !=0),2,sum) > 1*(colnames(rtrans) %in% topfeatures))
  rtrans <- rtrans[,csum]
  topfeatures <- unique(c(topfeatures,colnames(rtrans)))
  print(ncol(transform))

[1] 901

  transform <- transform[topfeatures,topfeatures]
  print(ncol(transform))

[1] 164

  if (ncol(transform)>100)
  {
    csum <- apply(1*(transform !=0),1,sum) 
    csum <- csum[csum > 1]
    csum <- csum + 0.01*VertexSize[names(csum)]
    csum <- csum[order(-csum)]
    tpsum <- min(20,length(csum))
    trsum <- rownames(transform)[rownames(transform) %in% names(csum[1:tpsum])]
    rtrans <- transform[trsum,]
    topfeatures <- unique(c(rownames(rtrans),colnames(rtrans)))
    transform <- transform[topfeatures,topfeatures]
    if (nrow(transform) > 150)
    {
      csum <- apply(1*(rtrans != 0 ),2,sum)
      csum <- csum + 0.01*VertexSize[names(csum)]
      csum <- csum[order(-csum)]
      tpsum <- min(130,length(csum))
      csum <- rownames(transform)[rownames(transform) %in% names(csum[1:tpsum])]
      csum <- unique(c(trsum,csum))
      transform <- transform[csum,csum]
    }
    print(ncol(transform))
  }

[1] 130


    if (ncol(transform) < 150)
    {

      gplots::heatmap.2(transform,
                        trace = "none",
                        mar = c(5,5),
                        col=rev(heat.colors(5)),
                        main = "Red Decorrelation matrix",
                        cexRow = cexheat,
                        cexCol = cexheat,
                       srtCol=45,
                       srtRow=45,
                        key.title=NA,
                        key.xlab="|Beta|>0",
                        xlab="Output Feature", ylab="Input Feature")
  
      par(op)
      VertexSize <- VertexSize[colnames(transform)]
      gr <- graph_from_adjacency_matrix(transform,mode = "directed",diag = FALSE,weighted=TRUE)
      gr$layout <- layout_with_fr
      
#      fc <- cluster_optimal(gr)
        fc <- cluster_walktrap (gr,steps=50)
      plot(fc, gr,
           edge.width = 2*E(gr)$weight,
           vertex.size=VertexSize,
           edge.arrow.size=0.5,
           edge.arrow.width=0.5,
           vertex.label.cex=(0.15+0.05*VertexSize),
           vertex.label.dist=0.5 + 0.05*VertexSize,
           main="Top Feature Association")
      
      varratios <- varratio
      fscores <- fscore
      names(varratios) <- str_remove_all(names(varratios),"La_")
      names(fscores) <- str_remove_all(names(fscores),"La_")

      dc <- getLatentCoefficients(DEdataframe)
      theCharformulas <- attr(dc,"LatentCharFormulas")

      
      clustable <- as.data.frame(cbind(Variable=fc$names,
                                       Formula=as.character(theCharformulas[paste("La_",fc$names,sep="")]),
                                       Class=fc$membership,
                                       ResidualVariance=round(varratios[fc$names],3),
                                       Fscore=round(fscores[fc$names],3)
                                       )
                                 )
      rownames(clustable) <- str_replace_all(rownames(clustable),"__","_")
      clustable$Variable <- NULL
      clustable$Class <- as.integer(clustable$Class)
      clustable$ResidualVariance <- as.numeric(clustable$ResidualVariance)
      clustable$Fscore <- as.numeric(clustable$Fscore)
      clustable <- clustable[order(-clustable$Fscore),]
      clustable <- clustable[order(clustable$Class),]
      clustable <- clustable[clustable$Fscore >= -1,]
      topv <- min(50,nrow(clustable))
      clustable <- clustable[1:topv,]
    }


pander::pander(clustable)
  Formula Class ResidualVariance Fscore
g4D02 NA 1 1.000 12
g1int118 - (0.738)g4D02 + g1int118 1 0.351 2
g1int105 + g1int105 - (0.729)g4D02 1 0.310 -1
g1int106 + g1int106 - (0.700)g4D02 1 0.321 -1
g1int107 + g1int107 - (0.882)g4D02 1 0.275 -1
g1int108 + g1int108 - (0.798)g4D02 1 0.332 -1
g4H01 + g4H01 - (0.769)g4D02 1 0.167 -1
g4A02 + g4A02 - (0.873)g4D02 1 0.107 -1
g4B02 - (1.046)g4D02 + g4B02 1 0.060 -1
g1int109 - (0.795)g4D02 + g1int109 1 0.184 -1
g1int110 - (0.551)g4D02 + g1int110 1 0.360 -1
g1int111 - (0.886)g4D02 + g1int111 1 0.296 -1
g1int112 - (0.627)g4D02 + g1int112 1 0.344 -1
g1G04 + g1G04 - (1.139)g1int118 1 0.145 -1
g4E02 + g4E02 - (0.953)g1int118 1 0.126 -1
g1CNS316 - (0.961)g1int118 + g1CNS316 1 0.160 -1
g11G02 NA 2 1.000 10
g11E03 - (0.738)g11G02 + g11E03 2 0.278 1
g4E01 + g4E01 - (0.670)g11G02 2 0.315 0
g11E02 + g11E02 - (0.642)g11G02 2 0.346 -1
g11B02 + g11B02 - (0.787)g11G02 2 0.262 -1
g1int77 + g1int77 - (0.624)g11G02 2 0.354 -1
g7F01 + g7F01 - (0.687)g11G02 2 0.318 -1
g1D08 + g1D08 - (0.967)g4E01 2 0.137 -1
g9D05 + g9D05 - (0.546)g11G02 2 0.324 -1
g11A03 + g11A03 - (0.885)g11E03 2 0.148 -1
g11H02 + g11H02 - (0.748)g11G02 2 0.229 -1
g11F02 - (0.941)g11G02 + g11F02 2 0.123 -1
g1int85 - (0.653)g11G02 + g1int85 2 0.332 -1
g11C03 - (0.863)g11E03 + g11C03 2 0.180 -1
g4H02 NA 3 1.000 17
g1int120 + g1int120 - (0.802)g4H02 3 0.329 0
g4F02 + g4F02 - (0.845)g4H02 3 0.282 -1
g1int121 + g1int121 - (0.982)g1int120 3 0.189 -1
g1int123 + g1int123 - (0.734)g4H02 3 0.288 -1
g1CNS124 + g1CNS124 - (0.559)g4H02 3 0.344 -1
g1CNS201 + g1CNS201 - (0.762)g4H02 3 0.346 -1
g1CNS236 + g1CNS236 - (0.915)g4H02 3 0.195 -1
g1CNS219 + g1CNS219 - (0.918)g4H02 3 0.169 -1
g1int126 + g1int126 - (0.841)g4H02 3 0.278 -1
g3D03 + g3D03 - (0.960)g4H02 3 0.156 -1
g1CNS421 + g1CNS421 - (0.776)g4H02 3 0.241 -1
g1int127 + g1int127 - (0.890)g4H02 3 0.280 -1
g2B09 + g2B09 - (0.942)g4H02 3 0.274 -1
g4A03 - (0.899)g4H02 + g4A03 3 0.172 -1
g1CNS465 - (1.071)g4H02 + g1CNS465 3 0.168 -1
g1int129 - (0.951)g4H02 + g1int129 3 0.340 -1
g7B08 - (0.683)g4H02 + g7B08 3 0.312 -1
g1CNS159 NA 4 1.000 11
g1int846 + g1int846 - (0.935)g1CNS159 4 0.191 1

par(op)

1.6 The heatmap of the decorrelated data

if (!largeSet)
{

  hm <- heatMaps(data=DEdataframe[1:numsub,],
                 Outcome=outcome,
                 Scale=TRUE,
                 hCluster = "row",
                 cexRow = cexheat,
                 cexCol = cexheat,
                 srtCol=45,
                 srtRow=45,
                 xlab="Feature",
                 ylab="Sample")
  par(op)
}

1.7 The correlation matrix after decorrelation

if (!largeSet)
{

  cormat <- cor(DEdataframe[,varlistc],method="pearson")
  cormat[is.na(cormat)] <- 0
  
  gplots::heatmap.2(abs(cormat),
                    trace = "none",
                    mar = c(5,5),
                    col=rev(heat.colors(5)),
                    main = "Correlation after ILAA",
                    cexRow = cexheat,
                    cexCol = cexheat,
                     srtCol=45,
                     srtRow=45,
                    key.title=NA,
                    key.xlab="|Pearson Correlation|",
                    xlab="Feature", ylab="Feature")
  
  par(op)
  diag(cormat) <- 0
  print(max(abs(cormat)))
}

1.8 U-MAP Visualization of features

1.8.1 The UMAP on Raw Data


  classes <- unique(dataframe[1:numsub,outcome])
  raincolors <- rainbow(length(classes))
  names(raincolors) <- classes
  topvars <- univariate_BinEnsemble(dataframe,outcome)
  lso <- LASSO_MIN(formula(paste(outcome,"~.")),dataframe,family="binomial")
  topvars <- unique(c(names(topvars),lso$selectedfeatures))
  pander::pander(head(topvars))

g1CNS507, g1CNS382, g1CNS91, g1CNS105, g1CNS26 and g1int804

#  names(topvars)
#if (nrow(dataframe) < 1000)
#{
  datasetframe.umap = umap(scale(dataframe[1:numsub,topvars]),n_components=2)
#  datasetframe.umap = umap(dataframe[1:numsub,varlist],n_components=2)
  plot(datasetframe.umap$layout,xlab="U1",ylab="U2",main="UMAP: Original",t='n')
  text(datasetframe.umap$layout,labels=dataframe[1:numsub,outcome],col=raincolors[dataframe[1:numsub,outcome]+1])

#}

1.8.2 The decorralted UMAP

  varlistcV <- names(varratio[varratio >= 0.01])
  topvars <- univariate_BinEnsemble(DEdataframe[,varlistcV],outcome)
  lso <- LASSO_MIN(formula(paste(outcome,"~.")),DEdataframe[,varlistcV],family="binomial")
  topvars <- unique(c(names(topvars),lso$selectedfeatures))
  pander::pander(head(topvars))

g1CNS507, g1int340, g1int812, g9E01, g8D02 and g1CNS29


  varlistcV <- varlistcV[varlistcV != outcome]
  
#  DEdataframe[,outcome] <- as.numeric(DEdataframe[,outcome])
#if (nrow(dataframe) < 1000)
#{
  datasetframe.umap = umap(scale(DEdataframe[1:numsub,topvars]),n_components=2)
#  datasetframe.umap = umap(DEdataframe[1:numsub,varlistcV],n_components=2)
  plot(datasetframe.umap$layout,xlab="U1",ylab="U2",main="UMAP: After ILAA",t='n')
  text(datasetframe.umap$layout,labels=DEdataframe[1:numsub,outcome],col=raincolors[DEdataframe[1:numsub,outcome]+1])

#}

1.9 Univariate Analysis

1.9.1 Univariate



univarRAW <- uniRankVar(varlist,
               paste(outcome,"~1"),
               outcome,
               dataframe,
               rankingTest="AUC")

100 : g1A07 200 : g3H03 300 : g1E07 400 : g4F03 500 : g4C05
600 : g1int292 700 : g1int356 800 : g7A03 900 : g4D11 1000 : g1int577
1100 : g1CNS420 1200 : g7G07 1300 : g1int785 1400 : g1CNS59 1500 : g1CNS178
1600 : g1int949 1700 : g1int1028 1800 : g1int1089 1900 : g11D05 2000 : g1int1222
2100 : g1int1298 2200 : g1int1376 2300 : g1int1449 2400 : g10E08 2500 : g1CNS90
2600 : g7F11 2700 : g1int1693 2800 : g1CNS93 2900 : g1int1800




univarDe <- uniRankVar(varlistc,
               paste(outcome,"~1"),
               outcome,
               DEdataframe,
               rankingTest="AUC",
               )

100 : g1A07 200 : g3H03 300 : La_g1E07 400 : g4F03 500 : g4C05
600 : La_g1int292 700 : g1int356 800 : g7A03 900 : g4D11 1000 : g1int577
1100 : g1CNS420 1200 : La_g7G07 1300 : g1int785 1400 : La_g1CNS59 1500 : g1CNS178
1600 : g1int949 1700 : La_g1int1028 1800 : La_g1int1089 1900 : g11D05 2000 : g1int1222
2100 : g1int1298 2200 : g1int1376 2300 : La_g1int1449 2400 : g10E08 2500 : g1CNS90
2600 : g7F11 2700 : g1int1693 2800 : La_g1CNS93 2900 : g1int1800

1.9.2 Final Table


univariate_columns <- c("caseMean","caseStd","controlMean","controlStd","controlKSP","ROCAUC")

##top variables
topvar <- c(1:length(varlist)) <= TopVariables
tableRaw <- univarRAW$orderframe[topvar,univariate_columns]
pander::pander(tableRaw)
  caseMean caseStd controlMean controlStd controlKSP ROCAUC
g1CNS507 -0.250 0.222 -0.0248 0.195 0.075349 0.796
g1CNS105 0.254 0.270 0.0533 0.172 0.009926 0.749
g1CNS382 -0.171 0.173 -0.0354 0.145 0.045504 0.745
g1int804 -0.250 0.220 -0.0775 0.205 0.020286 0.743
g1CNS91 0.310 0.294 0.0994 0.138 0.003638 0.742
g1CNS26 -0.145 0.169 -0.0101 0.125 0.657242 0.738
g1int340 -0.157 0.183 -0.0169 0.139 0.028840 0.737
g1CNS70 0.152 0.177 0.0352 0.115 0.022558 0.731
g1CNS158 0.260 0.251 0.1007 0.167 0.000401 0.731
g1CNS28 0.218 0.218 0.0744 0.143 0.001961 0.726


topLAvar <- univarDe$orderframe$Name[str_detect(univarDe$orderframe$Name,"La_")]
topLAvar <- unique(c(univarDe$orderframe$Name[topvar],topLAvar[1:as.integer(TopVariables/2)]))
finalTable <- univarDe$orderframe[topLAvar,univariate_columns]


pander::pander(finalTable)
  caseMean caseStd controlMean controlStd controlKSP ROCAUC
g1CNS507 -0.249546 0.2224 -0.024786 0.1946 0.075349 0.796
g1int340 -0.157144 0.1829 -0.016911 0.1391 0.028840 0.737
g8D02 -0.086458 0.1772 0.027399 0.1370 0.019618 0.725
g1CNS159 0.192334 0.2488 0.046005 0.1735 0.000826 0.723
g1int1671 0.126334 0.1404 0.039686 0.1218 0.099178 0.719
g9E01 0.091105 0.1633 -0.004645 0.0984 0.039398 0.717
g1int812 -0.192608 0.2140 -0.046220 0.1755 0.235071 0.716
g1CNS74 0.124918 0.1377 0.039057 0.1157 0.163757 0.716
g8F04 0.022967 0.0898 -0.046750 0.0893 0.805754 0.710
g5G03 -0.113922 0.1569 -0.014816 0.1450 0.040736 0.706
La_g1int829 0.027457 0.0662 -0.018160 0.0654 0.686753 0.700
La_g4F01 0.038348 0.0752 -0.000825 0.0780 0.311967 0.677
La_g1int1102 -0.075044 0.0969 -0.020829 0.0888 0.626136 0.674
La_g6H03 0.000405 0.1480 -0.045865 0.0647 0.370420 0.667
La_g1CNS91 0.081214 0.1814 0.012656 0.0845 0.130390 0.667

dc <- getLatentCoefficients(DEdataframe)
fscores <- attr(DEdataframe,"fscore")


pander::pander(c(mean=mean(sapply(dc,length)),total=length(dc),fraction=length(dc)/(ncol(dataframe)-1)))
mean total fraction
2 643 0.221

theCharformulas <- attr(dc,"LatentCharFormulas")

topvar <- rownames(tableRaw)
finalTable <- rbind(finalTable,tableRaw[topvar[!(topvar %in% topLAvar)],univariate_columns])


orgnamez <- rownames(finalTable)
orgnamez <- str_remove_all(orgnamez,"La_")
finalTable$RAWAUC <- univarRAW$orderframe[orgnamez,"ROCAUC"]
finalTable$DecorFormula <- theCharformulas[rownames(finalTable)]
finalTable$fscores <- fscores[rownames(finalTable)]
finalTable$varratio <- varratio[rownames(finalTable)]

Final_Columns <- c("DecorFormula","caseMean","caseStd","controlMean","controlStd","controlKSP","ROCAUC","RAWAUC","fscores","varratio")

finalTable <- finalTable[order(-finalTable$ROCAUC),]
pander::pander(finalTable[,Final_Columns])
  DecorFormula caseMean caseStd controlMean controlStd controlKSP ROCAUC RAWAUC fscores varratio
g1CNS507 NA -0.249546 0.2224 -0.024786 0.1946 0.075349 0.796 0.796 0 1.0000
g1CNS105 NA 0.254206 0.2698 0.053317 0.1720 0.009926 0.749 0.749 NA NA
g1CNS382 NA -0.171019 0.1731 -0.035377 0.1449 0.045504 0.745 0.745 NA NA
g1int804 NA -0.249725 0.2195 -0.077451 0.2051 0.020286 0.743 0.743 NA NA
g1CNS91 NA 0.309879 0.2941 0.099384 0.1383 0.003638 0.742 0.742 NA NA
g1CNS26 NA -0.144676 0.1686 -0.010129 0.1253 0.657242 0.738 0.738 NA NA
g1int340 NA -0.157144 0.1829 -0.016911 0.1391 0.028840 0.737 0.737 0 1.0000
g1CNS70 NA 0.152301 0.1769 0.035188 0.1152 0.022558 0.731 0.731 NA NA
g1CNS158 NA 0.259813 0.2505 0.100672 0.1674 0.000401 0.731 0.731 NA NA
g1CNS28 NA 0.218429 0.2175 0.074444 0.1432 0.001961 0.726 0.726 NA NA
g8D02 NA -0.086458 0.1772 0.027399 0.1370 0.019618 0.725 0.725 0 1.0000
g1CNS159 NA 0.192334 0.2488 0.046005 0.1735 0.000826 0.723 0.723 11 1.0000
g1int1671 NA 0.126334 0.1404 0.039686 0.1218 0.099178 0.719 0.719 0 1.0000
g9E01 NA 0.091105 0.1633 -0.004645 0.0984 0.039398 0.717 0.717 0 1.0000
g1int812 NA -0.192608 0.2140 -0.046220 0.1755 0.235071 0.716 0.716 12 1.0000
g1CNS74 NA 0.124918 0.1377 0.039057 0.1157 0.163757 0.716 0.716 0 1.0000
g8F04 NA 0.022967 0.0898 -0.046750 0.0893 0.805754 0.710 0.710 0 1.0000
g5G03 NA -0.113922 0.1569 -0.014816 0.1450 0.040736 0.706 0.706 0 1.0000
La_g1int829 - (1.024)g1int834 + g1int829 0.027457 0.0662 -0.018160 0.0654 0.686753 0.700 0.617 -1 0.0551
La_g4F01 + g4F01 - (0.613)g1int98 0.038348 0.0752 -0.000825 0.0780 0.311967 0.677 0.617 -1 0.3057
La_g1int1102 - (0.868)g1int1101 + g1int1102 -0.075044 0.0969 -0.020829 0.0888 0.626136 0.674 0.644 -1 0.2744
La_g6H03 + g6H03 - (1.057)g1int1515 0.000405 0.1480 -0.045865 0.0647 0.370420 0.667 0.523 -1 0.1964
La_g1CNS91 - (0.985)g5F04 + g1CNS91 0.081214 0.1814 0.012656 0.0845 0.130390 0.667 0.742 0 0.3255

1.10 Comparing ILAA vs PCA vs EFA

1.10.1 PCA

featuresnames <- colnames(dataframe)[colnames(dataframe) != outcome]
pc <- prcomp(dataframe[,iscontinous],center = TRUE,scale. = TRUE,tol=0.01)   #principal components
predPCA <- predict(pc,dataframe[,iscontinous])
PCAdataframe <- as.data.frame(cbind(predPCA,dataframe[,!iscontinous]))
colnames(PCAdataframe) <- c(colnames(predPCA),colnames(dataframe)[!iscontinous]) 
#plot(PCAdataframe[,colnames(PCAdataframe)!=outcome],col=dataframe[,outcome],cex=0.65,cex.lab=0.5,cex.axis=0.75,cex.sub=0.5,cex.main=0.75)

#pander::pander(pc$rotation)


PCACor <- cor(PCAdataframe[,colnames(PCAdataframe) != outcome])


  gplots::heatmap.2(abs(PCACor),
                    trace = "none",
  #                  scale = "row",
                    mar = c(5,5),
                    col=rev(heat.colors(5)),
                    main = "PCA Correlation",
                    cexRow = 0.5,
                    cexCol = 0.5,
                     srtCol=45,
                     srtRow= -45,
                    key.title=NA,
                    key.xlab="Pearson Correlation",
                    xlab="Feature", ylab="Feature")

1.10.2 EFA


EFAdataframe <- dataframeScaled

if (length(iscontinous) < 2000)
{
  topred <- min(length(iscontinous),nrow(dataframeScaled),ncol(predPCA)-1)
  if (topred < 2) topred <- 2
  
  uls <- fa(dataframeScaled[,iscontinous],nfactors=topred,rotate="varimax",warnings=FALSE)  # EFA analysis
  predEFA <- predict(uls,dataframeScaled[,iscontinous])
  EFAdataframe <- as.data.frame(cbind(predEFA,dataframeScaled[,!iscontinous]))
  colnames(EFAdataframe) <- c(colnames(predEFA),colnames(dataframeScaled)[!iscontinous]) 


  
  EFACor <- cor(EFAdataframe[,colnames(EFAdataframe) != outcome])
  
  
    gplots::heatmap.2(abs(EFACor),
                      trace = "none",
    #                  scale = "row",
                      mar = c(5,5),
                      col=rev(heat.colors(5)),
                      main = "EFA Correlation",
                      cexRow = 0.5,
                      cexCol = 0.5,
                       srtCol=45,
                       srtRow= -45,
                      key.title=NA,
                      key.xlab="Pearson Correlation",
                      xlab="Feature", ylab="Feature")
}

1.11 Effect on CAR modeling

par(op)
par(xpd = TRUE)
dataframe[,outcome] <- factor(dataframe[,outcome])
rawmodel <- rpart(paste(outcome,"~."),dataframe,control=rpart.control(maxdepth=3))
pr <- predict(rawmodel,dataframe,type = "class")

  ptab <- list(er="Error",detail=matrix(nrow=6,ncol=1))
  if (length(unique(pr))>1)
  {
    plot(rawmodel,main="Raw",branch=0.5,uniform = TRUE,compress = TRUE,margin=0.1)
    text(rawmodel, use.n = TRUE,cex=0.75)
    ptab <- epiR::epi.tests(table(pr==0,dataframe[,outcome]==0))
  }


pander::pander(table(dataframe[,outcome],pr))
  0 1
0 107 4
1 10 47
pander::pander(ptab$detail[c(5,3,4,6),])
  statistic est lower upper
5 diag.ac 0.917 0.864 0.954
3 se 0.825 0.701 0.913
4 sp 0.964 0.910 0.990
6 diag.or 125.725 37.521 421.276

par(op)
par(xpd = TRUE)
DEdataframe[,outcome] <- factor(DEdataframe[,outcome])
IDeAmodel <- rpart(paste(outcome,"~."),DEdataframe[,c(outcome,varlistcV)],control=rpart.control(maxdepth=3))
pr <- predict(IDeAmodel,DEdataframe,type = "class")

  ptab <- list(er="Error",detail=matrix(nrow=6,ncol=1))
  if (length(unique(pr))>1)
  {
    plot(IDeAmodel,main="ILAA",branch=0.5,uniform = TRUE,compress = TRUE,margin=0.1)
    text(IDeAmodel, use.n = TRUE,cex=0.75)
    ptab <- epiR::epi.tests(table(pr==0,DEdataframe[,outcome]==0))
  }

pander::pander(table(DEdataframe[,outcome],pr))
  0 1
0 103 8
1 6 51
pander::pander(ptab$detail[c(5,3,4,6),])
  statistic est lower upper
5 diag.ac 0.917 0.864 0.954
3 se 0.895 0.785 0.960
4 sp 0.928 0.863 0.968
6 diag.or 109.438 36.051 332.214

par(op)
par(xpd = TRUE)
PCAdataframe[,outcome] <- factor(PCAdataframe[,outcome])
PCAmodel <- rpart(paste(outcome,"~."),PCAdataframe,control=rpart.control(maxdepth=3))
pr <- predict(PCAmodel,PCAdataframe,type = "class")
ptab <- list(er="Error",detail=matrix(nrow=6,ncol=1))
if (length(unique(pr))>1)
{
  plot(PCAmodel,main="PCA",branch=0.5,uniform = TRUE,compress = TRUE,margin=0.1)
  text(PCAmodel, use.n = TRUE,cex=0.75)
  ptab <- epiR::epi.tests(table(pr==0,PCAdataframe[,outcome]==0))
}

pander::pander(table(PCAdataframe[,outcome],pr))
  0 1
0 105 6
1 27 30
pander::pander(ptab$detail[c(5,3,4,6),])
  statistic est lower upper
5 diag.ac 0.804 0.735 0.861
3 se 0.526 0.390 0.660
4 sp 0.946 0.886 0.980
6 diag.or 19.444 7.347 51.459


par(op)

1.11.1 EFA


  EFAdataframe[,outcome] <- factor(EFAdataframe[,outcome])
  EFAmodel <- rpart(paste(outcome,"~."),EFAdataframe,control=rpart.control(maxdepth=3))
  pr <- predict(EFAmodel,EFAdataframe,type = "class")
  
  ptab <- list(er="Error",detail=matrix(nrow=6,ncol=1))
  if (length(unique(pr))>1)
  {
    plot(EFAmodel,main="EFA",branch=0.5,uniform = TRUE,compress = TRUE,margin=0.1)
    text(EFAmodel, use.n = TRUE,cex=0.75)
    ptab <- epiR::epi.tests(table(pr==0,EFAdataframe[,outcome]==0))
  }


  pander::pander(table(EFAdataframe[,outcome],pr))
  0 1
0 107 4
1 10 47
  pander::pander(ptab$detail[c(5,3,4,6),])
  statistic est lower upper
5 diag.ac 0.917 0.864 0.954
3 se 0.825 0.701 0.913
4 sp 0.964 0.910 0.990
6 diag.or 125.725 37.521 421.276
  par(op)